Abstract

When using datasets gathered from mediums designed for casual conversation, such as Twitter, a major problem is the large amount of sarcasm they contain. Because sarcasm is difficult to detect for both humans and Natural Language Processing (NLP) models, it can hinder a model’s accuracy. As a result, we hoped to create an NLP model specifically designed to detect sarcasm on behalf of other NLP models. To do this, two datasets will be utilized: ISarcasm and Tweets with Sarcasm and Irony. These datasets will be used to train five different models: Recurrent Neural Networks (RNN), Support Vector Machines (SVM), Random Forests, Decision Trees, and XGBoost. By combining our model with other NLP models, we hope to increase the accuracy of these models. 

Dataset

Visualizations

Retrieve Data

  • The dataset already ships with separate train and test splits, so there is no need to split it manually
# Root directory of the "Tweets with Sarcasm and Irony" dataset
Dir <- 'Datasets/Tweets_with_Sarcasm_and_Irony'
train <- read_csv(paste0(Dir,"/train.csv"))
## Rows: 81408 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): tweets, class
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
test <- read_csv(paste0(Dir,"/test.csv"))
## Rows: 8128 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): tweets, class
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#test <- test %>% filter(!is.na(class))
#train$tweets <- map(train$tweets, .f = function(x){
#  str_squish(x)
#}) %>% unlist()

Some Exploration

  • Shows how many observations exist in each dataset
# Distinct class labels and split sizes, reported in the text below
classes <- train$class %>% unique()
num_obs_train <- nrow(train)
num_obs_test <- nrow(test)

Classes: figurative, irony, regular, sarcasm
Number of observations in Train: 81408
Number of observations in Test: 8128

  • Shows how many observations exist for each class
t <- train %>% group_by(class) %>% count()
t
  • View the missing data, we consider empty strings as missing data as well
train %>% filter(tweets == "" | tweets == " " | is.na(tweets))
  • Visualize tweet length range
# Tweet length in words: number of single spaces + 1.
# str_count() is already vectorized over the whole column, so the original
# map()/unlist() round-trip added overhead without changing the result.
tweet_lengths <- str_count(train$tweets, pattern = " ") + 1

# Attach the length and a row id for plotting
train_temp <- train %>% mutate(tweet_length = tweet_lengths,
                               id = seq_len(nrow(train)))

# Distribution of tweet lengths
train_temp %>% ggplot(aes(x = tweet_length)) + 
  geom_bar(aes(fill = after_stat(count)))

  • Based on the graphic, tweet lengths follow an approximately normal distribution
  • Tweet length range visualized with box plot
train_temp %>% ggplot(aes(y = tweet_lengths)) + 
  geom_boxplot()

  • Tweet length range visualized with a box plot for each class

train_temp %>% ggplot(aes(x = class, y = tweet_lengths)) + 
  geom_boxplot()

Max tweet length: 67
Min tweet length: 1
Mean tweet length: 15.1798595

Max Tweet Length

t <- train_temp %>% filter(tweet_lengths == max(tweet_lengths))
t

Min Tweet Length

t <- train_temp %>% filter(tweet_lengths == min(tweet_lengths))
t

Visualization/transformation Functions

  • Some functions for transformation; most notably, get_hashtags_df extracts hashtags from tweets into a new column.
  • Takes in a list of strings
#------------------------------------------------------------------------------
# Lightweight progress reporter for long-running loops.
# Prints "<percent>% Complete" once per 10% milestone.
#
# i:      current iteration
# length: total number of iterations
# p:      last percentage printed (initialise p = 0 before the loop, then feed
#         the return value back in so each milestone prints only once)
print_percent <- function(i, length, p) {
  done <- floor((i / length) * 100)
  at_milestone <- done %% 10 == 0
  if (at_milestone && p != done) {
    print(paste0(done, "% Complete"))
    p <- done
  }
  p
}
#------------------------------------------------------------------------------
# Separates hashtags from text.
# Takes a character vector of tweets and returns a list with one element per
# tweet, each element being a list of that tweet's "#"-containing tokens.
#
# Notes on the rewrite:
# - The original grew `y` and `hashtags` with append() inside nested
#   1:length() loops (O(n^2) copying); this version is fully vectorized.
# - The original grepl pattern "#.*" matches any token CONTAINING "#", not
#   only tokens starting with one; that behavior is preserved.
# - Progress printing (print_percent) is dropped: the vectorized version is
#   effectively instant, and it crashed on NA tweets anyway.
get_hashtags_df <- function(text) {
  lapply(strsplit(text, " ", fixed = TRUE), function(tokens) {
    as.list(tokens[grepl("#", tokens, fixed = TRUE)])
  })
}

Read in Data

Dir <- 'Datasets/Tweets_with_Sarcasm_and_Irony'
train <- read_csv(paste0(Dir,"/train.csv"))
## Rows: 81408 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): tweets, class
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
test <- read_csv(paste0(Dir,"/test.csv"))
## Rows: 8128 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): tweets, class
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#test <- test %>% filter(!is.na(class))

Separate Hashtags from text into a new column

# Re-read the raw training split, then move hashtags into their own column
Dir <- 'Datasets/Tweets_with_Sarcasm_and_Irony'
train <- read_csv(paste0(Dir,"/train.csv"))
# Extract per-tweet hashtags (list column), then strip them from the text.
# NOTE(review): the regex "#.* | #.*$" is greedy -- it removes from the first
# "#" through the last space, which can delete more than just hashtag tokens;
# confirm this is intended.
y <- get_hashtags_df(train$tweets)
train$tweets <- train$tweets %>% sub(pattern = "#.* | #.*$", replacement = "")
train <- train %>% mutate(hashtags = y)
#Note to load from disk use load("Datasets/train_w_hashtags.RData") in the markdown file
save(train, file="Datasets/train_w_hashtags.RData")



test <- read_csv(paste0(Dir,"/test.csv"))
# BUG FIX: this block previously recomputed hashtags from train$tweets and
# mutated `train` again, so test_w_hashtags.RData was saved without any
# hashtag column or hashtag stripping. Operate on `test` instead.
y <- get_hashtags_df(test$tweets)
test$tweets <- test$tweets %>% sub(pattern = "#.* | #.*$", replacement = "")
test <- test %>% mutate(hashtags = y)
#Note to load from disk use load("Datasets/test_w_hashtags.RData") in the markdown file
save(test, file="Datasets/test_w_hashtags.RData")
  • Preprocessing function for cleaning tweet column
# Clean the tweet column: strip @mentions, lower-case, drop punctuation and
# English stop words, then remove rows whose tweet becomes empty.
preprocessing <- function(data) {
    require('tm')
    require('stopwords')
    
    # NOTE(review): "@.* | @.*$" is greedy -- it removes from the first "@"
    # through the last space, not just the mention token; confirm intended.
    data$tweets <- data$tweets %>% sub(pattern = "@.* | @.*$", replacement = "")
    data$tweets <- tolower(data$tweets)
    data$tweets <- removePunctuation(data$tweets)
    # Both tm and stopwords export stopwords(); loaded in this order the
    # stopwords package masks tm's version -- verify which word list is wanted.
    data$tweets <- removeWords(data$tweets, words = stopwords('en'))
    data <- data  %>% filter(tweets != "")
    data
}


load('Datasets/Tweets_with_Sarcasm_and_Irony/train_w_hashtags.Rdata')
train <- preprocessing(train)
train
train$hashtags %>% head()
## [[1]]
## [[1]][[1]]
## [1] "#staylight"
## 
## [[1]][[2]]
## [1] "#staywhite"
## 
## [[1]][[3]]
## [1] "#sarcastic"
## 
## [[1]][[4]]
## [1] "#moralneeded"
## 
## 
## [[2]]
## [[2]][[1]]
## [1] "#sarcasm"
## 
## [[2]][[2]]
## [1] "#people"
## 
## [[2]][[3]]
## [1] "#diy"
## 
## [[2]][[4]]
## [1] "#artattack"
## 
## 
## [[3]]
## [[3]][[1]]
## [1] "#DailyMail"
## 
## [[3]][[2]]
## [1] "#shocker"
## 
## [[3]][[3]]
## [1] "#sarcastic"
## 
## [[3]][[4]]
## [1] "#dailyfail"
## 
## [[3]][[5]]
## [1] "#inHuntspocket"
## 
## [[3]][[6]]
## [1] "#theyhatethenhs"
## 
## 
## [[4]]
## [[4]][[1]]
## [1] "#sarcasm"
## 
## 
## [[5]]
## [[5]][[1]]
## [1] "#sarcastic"
## 
## 
## [[6]]
## [[6]][[1]]
## [1] "#Irony"
## 
## [[6]][[2]]
## [1] "#TimesChange"
# Reload the raw splits and subset the training data by class label
train <- read.csv('Datasets/Tweets_with_Sarcasm_and_Irony/train.csv')
test <- read.csv('Datasets/Tweets_with_Sarcasm_and_Irony/test.csv')
figurativeSet <- filter(train, class=="figurative")

ironySet <- filter(train, class == "irony")

sarcasmSet <- filter(train, class =="sarcasm")

regularSet <- filter(train, class =="regular")


# Everything that is not "regular" (figurative + irony + sarcasm)
not_regularSet <- filter(train, class != "regular")
  • Filter tweets by their class
freq_figurative <- as.data.frame(sort(table(unlist(strsplit(figurativeSet$tweets," "))), decreasing = TRUE), stringsAsFactors = FALSE)
summary(freq_figurative)
##      Var1                Freq         
##  Length:73284       Min.   :   1.000  
##  Class :character   1st Qu.:   1.000  
##  Mode  :character   Median :   1.000  
##                     Mean   :   4.469  
##                     3rd Qu.:   1.000  
##                     Max.   :8721.000
freq_irony <- as.data.frame(sort(table(unlist(strsplit(ironySet$tweets," "))), decreasing = TRUE), stringsAsFactors = FALSE)

freq_sarcasm <- as.data.frame(sort(table(unlist(strsplit(sarcasmSet$tweets," "))), decreasing = TRUE), stringsAsFactors = FALSE)

freq_regular <- as.data.frame(sort(table(unlist(strsplit(regularSet$tweets," "))), decreasing = TRUE), stringsAsFactors = FALSE)

freq_not_regular <- as.data.frame(sort(table(unlist(strsplit(not_regularSet$tweets," "))), decreasing = TRUE), stringsAsFactors = FALSE)
  • Collect the words and their frequencies from the set

Outliers

Figurative Class Outliers

 # Words appearing more than 851 times in the figurative class, plotted in
 # descending frequency. The original passed stringsAsFactors to subset(),
 # which has no such argument (it was silently swallowed by `...`); removed.
 outlierSubsetFigurative <- subset(freq_figurative, Freq > 851)
  outlierSubsetFigurative %>% 
    ggplot(aes(x = reorder(Var1, order(Freq, decreasing = TRUE)), y = Freq)) +
        geom_bar(stat = 'identity')  +
          theme(axis.text.x = element_text(angle = 60, hjust = 1))

Irony Class Outliers

  # Words appearing more than 851 times in the irony class. The no-op
  # stringsAsFactors argument (not part of subset()'s signature) is removed.
  outlierSubsetIrony <- subset(freq_irony, Freq > 851)
  outlierSubsetIrony %>% 
    ggplot(aes(x = reorder(Var1, order(Freq, decreasing = TRUE)), y = Freq)) +
        geom_bar(stat = 'identity')  +
          theme(axis.text.x = element_text(angle = 60, hjust = 1))

Sarcasm Class Outliers

  # Words appearing more than 851 times in the sarcasm class. The no-op
  # stringsAsFactors argument (not part of subset()'s signature) is removed.
  outlierSubsetSarcasm <- subset(freq_sarcasm, Freq > 851)
  outlierSubsetSarcasm %>% 
    ggplot(aes(x = reorder(Var1, order(Freq, decreasing = TRUE)), y = Freq)) +
        geom_bar(stat = 'identity')  +
          theme(axis.text.x = element_text(angle = 60, hjust = 1))

Regular Class Outliers

  # Words appearing more than 851 times in the regular class. The no-op
  # stringsAsFactors argument (not part of subset()'s signature) is removed.
  outlierSubsetRegular <- subset(freq_regular, Freq > 851)
  outlierSubsetRegular %>% 
    ggplot(aes(x = reorder(Var1, order(Freq, decreasing = TRUE)), y = Freq)) +
        geom_bar(stat = 'identity')  +
          theme(axis.text.x = element_text(angle = 60, hjust = 1))

# Join the per-class word-frequency tables on the word (Var1). The .x/.y
# suffixes come from the repeated joins and are renamed to their classes.
# (NOTE: "frequencys" is a misspelling of "frequencies", but the name is used
# throughout the rest of the document, so it is kept.)
frequencys <- full_join(freq_figurative,freq_irony, by = "Var1") %>%
  full_join(freq_regular, by = "Var1") %>%
  full_join(freq_sarcasm, by = "Var1") %>%
  rename(figurative = Freq.x,
         irony = Freq.y,
         regular = Freq.x.x,
         sarcasm = Freq.y.y)


# Two-class variant: regular vs everything else
frequencys_2_class <-  full_join(freq_regular,freq_not_regular, by = "Var1") %>%
   rename(regular = Freq.x,
         not_regular = Freq.y)


# Replace zero/missing counts with 1 so the ratios below never divide by zero
frequencys_2_class[frequencys_2_class == 0] <- 1
frequencys_2_class[is.na(frequencys_2_class)] <- 1


frequencys[frequencys == 0] <- 1
frequencys[is.na(frequencys)] <- 1

# Distinctiveness heuristic: a word's count in one class divided by the
# product of its counts in the other three (not a normalized probability)
frequencys <- frequencys %>%
  mutate(figurative_prop = figurative/(irony * regular * sarcasm)) %>%
  mutate(irony_prop = irony/(figurative * regular * sarcasm)) %>%
  mutate(sarcasm_prop = sarcasm/(figurative * regular * irony)) %>%
  mutate(regular_prop = regular/(figurative * sarcasm * irony)) 


# Regular-vs-rest ratios, in both directions
frequencys_2_class <- frequencys_2_class %>% 
  mutate(prop = regular/not_regular) %>%
  mutate(inv_prop = not_regular/regular)



# Plot only the top `max` most distinctive words per chart
max = 60
#graph_freq <- function(df, max_entries = 60) {
  frequencys %>% 
  arrange(desc(regular_prop)) %>%
  slice(1:max) %>%
  ggplot(aes(y = regular_prop, x = reorder(Var1, order(regular_prop, decreasing = TRUE)))) +
  geom_bar(stat='identity') +
  ggtitle("regular Outliers") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

#}

# Same chart for words distinctive of the non-regular classes
  frequencys_2_class %>% 
  arrange(desc(prop)) %>%
  slice(1:max) %>%
  ggplot(aes(y = prop, x = reorder(Var1, order(prop, decreasing = TRUE)))) +
  geom_bar(stat='identity') +
  ggtitle("Not regular Outliers") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

ggwordcloud Visualization

# Word clouds of the top `max` distinctive words for each side
max = 60
frequencys_tmp_prop <- frequencys_2_class %>%
  arrange(desc(prop)) %>%
  slice(1:max)

frequencys_tmp_inv_prop <- frequencys_2_class %>%
  arrange(desc(inv_prop)) %>%
  slice(1:max)


# Area-weighted clouds: word size encodes the regular/not_regular ratio
ggplot(frequencys_tmp_prop, aes(label = Var1, size = prop)) +
  geom_text_wordcloud_area(eccentricity = .54, color = 'red') +
  #scale_size_area(max_size = 30) +
  theme_minimal() +  
  ggtitle ('Words found in tweets that are considered regular')

ggplot(frequencys_tmp_inv_prop, aes(label = Var1, size = inv_prop)) +
  geom_text_wordcloud_area(eccentricity = .54, color = 'blue') +
  #scale_size_area(max_size = 30) +
  theme_minimal() +  
  ggtitle ('Words found in tweets that are not considered regular (irony, figurative, sarcasm)')
## Warning in png(filename = tmp_file, width = gw_pix, height = gh_pix, res =
## dev_dpi, : 'width=12, height=12' are unlikely values in pixels
## Warning in png(filename = tmp_file, width = gw_pix, height = gh_pix, res =
## dev_dpi, : 'width=12, height=16' are unlikely values in pixels

  • These two graphs demonstrate common words found in the classes. The first graph shows the regular class and the second shows the irony, figurative, and sarcasm classes

Models

Results

RNN

# Tweet-cleaning used by the RNN experiments.
# NOTE(review): near-duplicate of the earlier preprocessing(); unlike that
# version it does not require('stopwords') and does not drop empty tweets.
preprocessing <- function(data) {
    require('tm')
    
    # Greedy mention stripping (from the first "@" through the last space)
    data$tweets <- data$tweets %>% sub(pattern = "@.* | @.*$", replacement = "")
    data$tweets <- tolower(data$tweets)
    data$tweets <- removePunctuation(data$tweets)
    data$tweets <- removeWords(data$tweets, words = stopwords('en'))
    #data$tweets <- data$tweets[data$tweets != ""]
    data
  }
# NOTE(review): Dir_ISarcasm is not assigned until a few lines below -- this
# chunk only works because the variable already existed in the knitting
# session. Consider moving the directory definitions above this chunk.
Dir = Dir_ISarcasm
train <- read.csv(paste0(Dir,"/train.csv"), fileEncoding = 'utf-8')
train <- preprocessing(train)
# The "NAs introduced by coercion" warning below shows these class values are
# not numeric strings, so the labels become NA -- TODO confirm the ISarcasm
# label column and its values before using training_labels.
training_labels <- (train$class %>% as.array() %>% as.double()) 
## Warning in train$class %>% as.array() %>% as.double(): NAs introduced by
## coercion
Dir_Main <- 'Datasets/Tweets_with_Sarcasm_and_Irony'
Dir_ISarcasm <- 'Datasets/ISarcasm'


tensorflow::tf$python$client$device_lib$list_local_devices() %>% print()
## [[1]]
## name: "/device:CPU:0"
## device_type: "CPU"
## memory_limit: 268435456
## locality {
## }
## incarnation: 9840211226849380258
## xla_global_id: -1
## 
## 
## [[2]]
## name: "/device:GPU:0"
## device_type: "GPU"
## memory_limit: 5719982080
## locality {
##   bus_id: 1
##   links {
##   }
## }
## incarnation: 7942840533864140915
## physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 3070, pci bus id: 0000:01:00.0, compute capability: 8.6"
## xla_global_id: 416903419
#-------------------------------------------------------------------
# Balance a binary-labelled data frame by downsampling class 1 ("sarcasm")
# to the size of class 0 ("regular"), then shuffling the rows.
#
# data: data frame with a numeric 0/1 `class` column.
# Returns the balanced, shuffled data frame (interface unchanged).
#
# BUG FIX: the original `sarcasm[1:num_regular, ]` produced NA-filled rows
# whenever there were fewer sarcasm rows than regular rows, and broke when
# num_regular == 0 (1:0 indexes rows 1 and 0). The size is now clamped.
even_out_observations <- function(data){
  regular <- data[which(data$class == 0), , drop = FALSE]
  sarcasm <- data[which(data$class == 1), , drop = FALSE]
  num_regular <- nrow(regular)
  keep <- seq_len(min(num_regular, nrow(sarcasm)))
  sarcasm <- sarcasm[keep, , drop = FALSE]
  data <- rbind(regular, sarcasm)
  data[sample(seq_len(nrow(data))), , drop = FALSE]
}

#-------------------------------------------------------------------
# Load the ISarcasm train/test CSVs, clean the tweet text, encode the class
# labels as integers, and carve a validation split out of the training data.
#
# Dir:    dataset directory containing train.csv / test.csv
# binary: if TRUE all non-regular classes collapse to 1 (sarcastic);
#         otherwise figurative = 2 and irony = 3
# Returns a list of the three data sets plus numeric label vectors.
retrieve_dataset_ISarcasm <- function(Dir = Dir_ISarcasm, binary = FALSE) {
  train <- read.csv(paste0(Dir,"/train.csv"), fileEncoding = 'utf-8')# %>% rename(tweets = tweet, class = sarcastic)
  test <- read.csv(paste0(Dir,"/test.csv"),fileEncoding = 'utf-8') #%>% rename(tweets = tweet, class = sarcastic)
  # Local tweet-cleaning helper (mentions, case, punctuation, stop words)
  preprocessing <- function(data) {
    require('tm')
    
    data$tweets <- data$tweets %>% sub(pattern = "@.* | @.*$", replacement = "")
    data$tweets <- tolower(data$tweets)
    data$tweets <- removePunctuation(data$tweets)
    data$tweets <- removeWords(data$tweets, words = stopwords('en'))
    #data$tweets <- data$tweets[data$tweets != ""]
    data
  }
  
  
  train <- preprocessing(train)
  test <- preprocessing(test)

  # Map class names to integer codes (see `binary` above)
  factor_set <- function(set) {
    set$class[set$class == 'regular'] = 0
    set$class[set$class == 'sarcasm'] = 1
    
    if(!binary) {
      set$class[set$class == 'figurative'] = 2
      set$class[set$class == 'irony'] = 3
    } else {
      set$class[set$class == 'figurative'] = 1
      set$class[set$class == 'irony'] = 1
    }
    set
  }

  train <- factor_set(train)
  test <- factor_set(test)

  index <- createDataPartition(train$class, p = .8, list = FALSE)
  
  # BUG FIX: the original overwrote `train` first and then took the
  # validation rows from the already-subset train, so validation data
  # overlapped the training data instead of being held out.
  validation <- train[-index,]
  train <- train[index,]
  
  
  training_labels <- (train$class %>% as.array() %>% as.double())
  validation_labels <- (validation$class %>% as.array() %>% as.double())
  test_labels <-  (test$class %>% as.array() %>% as.double()) 
  
  
  list(train_set = train,
       train_labels = training_labels,
       test_set = test,
       test_labels = test_labels,
       validation_set = validation,
       validation_labels = validation_labels)
}

#-------------------------------------------------------------------
#load('Datasets/Tweets_with_Sarcasm_and_Irony/test_w_hashtags.RData')
# Load the Tweets with Sarcasm and Irony train/test CSVs, clean the tweet
# text, encode class labels as integers, and carve a validation split out of
# the training data.
#
# Dir:              dataset directory
# binary:           if TRUE all non-regular classes collapse to 1
# even_out:         if TRUE (binary only) downsample so classes are balanced
# without_hashtags: if TRUE load the *_without_hashtags.csv variants
# Returns a list of the three data sets plus numeric label vectors.
retrieve_dataset <- function(Dir = 'Datasets/Tweets_with_Sarcasm_and_Irony', binary = FALSE, even_out = FALSE, without_hashtags = FALSE) {
  
  
  if(!without_hashtags){
    train <- read_csv(paste0(Dir,"/train.csv"))
    test <- read_csv(paste0(Dir,"/test.csv"))
  } else {
    train <- read_csv(paste0(Dir,"/train_without_hashtags.csv"))
    test <- read_csv(paste0(Dir,"/test_without_hashtags.csv"))
  }
  
  
  # Local tweet-cleaning helper (case, punctuation, stop words)
  preprocessing <- function(data) {
    require('tm')
    data$tweets <- tolower(data$tweets)
    data$tweets <- removePunctuation(data$tweets)
    data$tweets <- removeWords(data$tweets, words = stopwords('en'))
    data
  }
  
  train <- preprocessing(train)
  test <- preprocessing(test)
  
  
  # Drop test rows with a missing label
  test <- test %>% filter(!is.na(class))
  
  # Map class names to integer codes (see `binary` above)
  factor_set <- function(set) {
    set$class[set$class == 'regular'] = 0
    set$class[set$class == 'sarcasm'] = 1
    
    if(!binary) {
      set$class[set$class == 'figurative'] = 2
      set$class[set$class == 'irony'] = 3
    } else {
      set$class[set$class == 'figurative'] = 1
      set$class[set$class == 'irony'] = 1
    }
    set
  }
  train <- factor_set(train)
  test <- factor_set(test)
  
  index <- createDataPartition(train$class, p = .8, list = FALSE)
  # BUG FIX: the original subset `train` first and then took validation rows
  # from the already-subset train, so validation overlapped the training data
  # instead of being held out.
  validation <- train[-index,]
  train <- train[index,]
  
  if(even_out && binary){
    train <- even_out_observations(train)
    test <- even_out_observations(test)
    validation <- even_out_observations(validation)
  }
   
  training_labels <- (train$class %>% as.array() %>% as.double())
  validation_labels <- (validation$class %>% as.array() %>% as.double())
  test_labels <-  (test$class %>% as.array() %>% as.double()) 
  
  
  
  list(train_set = train,
       train_labels = training_labels,
       test_set = test,
       test_labels = test_labels,
       validation_set = validation,
       validation_labels = validation_labels)
}
#--------------------------------------------------------------------------------
#--------------------------------------------------------------------------------
# Tokenize three text splits and pad each to a fixed length.
#
# train_data / validation_data / testing_data: data frames with a `tweets` column
# maxlen:    every sequence is padded/truncated to this length
# max_words: vocabulary cap; only the max_words most frequent words are encoded
# tokenizer: pass an already-fit tokenizer to reuse it; NULL fits a new one on
#            the training text only (keeps test vocabulary out of the fit)
# Returns the three padded integer-sequence matrices plus the tokenizer.
generate_sequences <- function(train_data,#training data
                               validation_data,# validation data
                               testing_data,
                               maxlen = 50,#maximum length of each padded sequence
                               max_words = 2000,
                               tokenizer = NULL)#vocabulary cap for the embedding
{
  
 
  training_text <- train_data$tweets %>% as.array()#get the text
  validation_text <- validation_data$tweets %>% as.array()#get the text
  testing_text <- testing_data$tweets %>% as.array()
  
  
  if(is.null(tokenizer)) {
    tokenizer <- text_tokenizer(num_words = max_words) %>%#create and fit tokenizer
    fit_text_tokenizer(training_text)
    print('creating Tokenizer.....')
  } else {
    print('found tokenizer!')
  }
  
  # texts_to_sequences maps each word to its integer index (see
  # tokenizer$word_index); pad_sequences makes every row exactly maxlen long
  sequences <- texts_to_sequences(tokenizer,training_text) #Translates text to sequences of integers(use the tokenizer$word_index to know which int maps to what word)
  training_sequences <- pad_sequences(sequences, maxlen = maxlen)#make all sequences the same length with the length being maxlen
  sequences <- texts_to_sequences(tokenizer,validation_text) #Translates text to sequences of integers(use the tokenizer$word_index to know which int maps to what word)
  validation_sequences <- pad_sequences(sequences, maxlen = maxlen)#make all sequences the same length with the length being maxlen
  sequences <- texts_to_sequences(tokenizer,testing_text)
  testing_sequences <- pad_sequences(sequences, maxlen = maxlen)
  
  
  
  list(train = training_sequences,
       validation = validation_sequences,
       test = testing_sequences,
       tokenizer = tokenizer
       )
}
#-------------------------------------------------------------------------------------------------------------------
#-------------------------------------------------------------------------------------------------------------------
# Build a confusion-matrix summary table from binary labels and predictions.
#
# Labels, Guesses: equal-length vectors (or lists) of 0/1 values.
# Returns a data.frame with `group` (cell descriptions) and `value` (counts)
# in the fixed order TN, FP, FN, TP -- the same order the original produced.
#
# Rewrite notes: the original mapped every pair through a binary-string
# strtoi() trick via purrr::map2 and then scanned the result four times;
# this version computes each pair's cell index arithmetically and counts
# with a single tabulate() pass, with no purrr dependency.
Accuracy_Label_Table <- function (Labels, Guesses) {
  labels <- unlist(Labels)
  guesses <- unlist(Guesses)
  # Map (label, guess) to 1..4: 1 = TN, 2 = FP, 3 = FN, 4 = TP
  cell <- labels * 2 + guesses + 1
  counts <- tabulate(cell, nbins = 4)
  
  group = c("True Negative (TN)",  #Label = 0, Guess = 0
            "False Positive (FP)", #Label = 0, Guess = 1
            "False Negative (FN)", #Label = 1, Guess = 0
            "True Positive (TP)"   #Label = 1, Guess = 1
            )
  
  data.frame(group = group,
             value = counts)
}
#-------------------------------------------------------------------------------
#-------------------------------------------------------------------------------
# Pie chart of confusion-matrix cells (TN/FP/FN/TP) for binary predictions,
# annotated with per-class and total accuracy figures.
# Labels, Guesses: equal-length 0/1 vectors.
FP_Pie_Chart <- function(Labels, Guesses) {
  a_table <- Accuracy_Label_Table(Labels = Labels,
                     Guesses = Guesses)

  # a_table rows are in fixed order: 1 = TN, 2 = FP, 3 = FN, 4 = TP
  # N_Acc = TN/(TN+FN), P_Acc = TP/(TP+FP), Acc = (TN+TP)/total
  N_Acc <- round(a_table[1,2] / (a_table[1,2] + a_table[3,2]), digits = 4)
  P_Acc <- round(a_table[4,2] / (a_table[4,2] + a_table[2,2]), digits = 4)
  Acc <- round((a_table[1,2] + a_table[4,2]) / (a_table[1,2] + a_table[3,2] + a_table[4,2] + a_table[2,2]), digits = 4)
  
  # A stacked bar in polar coordinates renders as a pie chart
  plt <- a_table %>%
    ggplot(aes(x = "", y = value, fill = group)) +
    geom_col() + 
    geom_label(aes(label = value),
               position = position_stack(vjust = 0.5),
               show.legend = FALSE) +
    coord_polar(theta = "y") +
    scale_fill_manual(values = c("#FFABAB", "#FFB092",
                                 "#b4d4fa", "#BFFCC6"),
                      guide = guide_legend(reverse = TRUE)) + 
    ggtitle("TP, TN, FP, FN Pie Chart") +
    theme_void()
  
  # ggdraw (cowplot) wraps the plot so text can be pinned in canvas coords
  plt <- ggdraw(plt)
  
  # Accuracy annotations stacked down the top-right corner
  plt <- plt +
    annotation_custom(grob = textGrob(paste0("Accuracy Positive: ",P_Acc)),  xmin = 1 - .2, xmax = 1 - .2, ymin = 1 - .025, ymax = 1- .025) +
    annotation_custom(grob = textGrob(paste0("Accuracy Negative: ",N_Acc)),  xmin = 1 - .2, xmax = 1 - .2, ymin = 1 - .025 - .05, ymax = 1- .025 - .05) +
    annotation_custom(grob = textGrob(paste0("Total Accuracy: ",Acc)),  xmin = 1 - .2, xmax = 1 - .2, ymin = 1 - .025 - .1, ymax = 1- .025 - .1)
  plt
}
#-------------------------------------------------------------------------------------------
#-------------------------------------------------------------------------------------------
# One-hot (binary bag-of-words) encode the train/validation/test text.
#
# train/validation/test: character vectors of documents
# max_words: vocabulary cap used when a tokenizer must be created
# tokenizer: pass an already-fit tokenizer to reuse it; with the NA default a
#            new tokenizer is fit on the training text
# Returns the three 0/1 document-term matrices plus the tokenizer used.
# (The `valdiation` list-name typo is kept deliberately: callers may already
# index the result by that name.)
one_hot_encode <- function(train,validation,test, max_words, tokenizer = NA) {
  
  training_text <- train %>% as.array()
  validation_text <- validation %>% as.array()
  testing_text <- test %>% as.array()
  
  # BUG FIX: the original tested `!is.na(tokenizer)`, which rebuilt the
  # tokenizer when one WAS supplied and passed the NA default straight into
  # texts_to_matrix() when one was not. Create a tokenizer only when missing.
  if(is.null(tokenizer) || (is.atomic(tokenizer) && anyNA(tokenizer))){
    tokenizer <- text_tokenizer(num_words = max_words) %>%
    fit_text_tokenizer(training_text)
  }
  
  # mode = "binary": 1 if the word occurs in the document, else 0
  train_one_hot_matrix <- texts_to_matrix(tokenizer, training_text, mode = "binary")
  validation_one_hot_matrix <- texts_to_matrix(tokenizer, validation_text, mode = "binary")
  test_one_hot_matrix <- texts_to_matrix(tokenizer, testing_text, mode = "binary")
  
  list(train = train_one_hot_matrix,
       valdiation = validation_one_hot_matrix,
       test = test_one_hot_matrix,
    tokenizer = tokenizer)
}

Datasets

Tweets with Sarcasm Data set with Hashtags
  • Click Code to show model code
# Hyperparameters: vocabulary size, embedding width, padded sequence length
max_words = 1000
embedding_dim = 8
maxlen = 50


# Binary task: regular (0) vs everything else (1); hashtags kept in the text
sets <- retrieve_dataset(binary = TRUE
                         )

train <- sets$train_set
training_labels <- sets$train_labels

validation <- sets$validation_set
validation_labels <- sets$validation_labels

test <- sets$test_set
test_labels <- sets$test_labels



# Tokenize and pad all three splits (tokenizer fit on training text only)
sequences <- generate_sequences(train,
                                validation,
                                test,
                                maxlen = maxlen,
                                max_words = max_words)
training_sequences <- sequences$train
validation_sequences <- sequences$validation
test_sequences <- sequences$test

# Embedding -> bidirectional LSTM -> LSTM -> sigmoid binary classifier
model <-  keras_model_sequential() %>%
  layer_embedding(input_dim = max_words,
                  output_dim = embedding_dim,
                  input_length = maxlen) %>%
  bidirectional(layer_lstm(units = 128, return_sequences = TRUE))%>%
  layer_lstm(units = 64, return_sequences = FALSE) %>%
  layer_flatten() %>%
  layer_dense(units = 1, 
              activation = "sigmoid") 

model %>% compile(
  optimizer = "rmsprop",
  loss = "binary_crossentropy",
  metrics = "accuracy"
)

history <- model %>% fit(
  training_sequences,
  training_labels,
  epochs = 5,
  batch_size = 128,
  validation_data= list(validation_sequences,validation_labels)
)


# Held-out loss and accuracy on the test split
results <- model %>% evaluate(test_sequences,test_labels)
results
Predictions and visualizing accuracy
## Rows: 81408 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): tweets, class
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 8128 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): tweets, class
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## [1] "found tokenizer!"

Tweets with Sarcasm Data set without Hashtags
  • Click Code to show the model code
# Hyperparameters: vocabulary size, embedding width, padded sequence length
max_words = 1000
embedding_dim = 8
maxlen = 50


# Same binary task, but using the *_without_hashtags.csv variants
sets <- retrieve_dataset(binary = TRUE,
                         without_hashtags = TRUE
                         )

train <- sets$train_set
training_labels <- sets$train_labels

validation <- sets$validation_set
validation_labels <- sets$validation_labels

test <- sets$test_set
test_labels <- sets$test_labels



# Tokenize and pad all three splits (tokenizer fit on training text only)
sequences <- generate_sequences(train,
                                validation,
                                test,
                                maxlen = maxlen,
                                max_words = max_words)
training_sequences <- sequences$train
validation_sequences <- sequences$validation
test_sequences <- sequences$test

# Same architecture as the with-hashtags run, for comparability
model <-  keras_model_sequential() %>%
  layer_embedding(input_dim = max_words,
                  output_dim = embedding_dim,
                  input_length = maxlen) %>%
  bidirectional(layer_lstm(units = 128, return_sequences = TRUE))%>%
  layer_lstm(units = 64, return_sequences = FALSE) %>%
  layer_flatten() %>%
  layer_dense(units = 1, 
              activation = "sigmoid") 

model %>% compile(
  optimizer = "rmsprop",
  loss = "binary_crossentropy",
  metrics = "accuracy"
)

history <- model %>% fit(
  training_sequences,
  training_labels,
  epochs = 5,
  batch_size = 128,
  validation_data= list(validation_sequences,validation_labels)
)


# Held-out loss and accuracy on the test split
results <- model %>% evaluate(test_sequences,test_labels)
results
Predictions and visualizing accuracy
## New names:
## Rows: 81408 Columns: 3
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): tweets, class dbl (1): ...1
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## New names:
## Rows: 8128 Columns: 3
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): tweets, class dbl (1): ...1
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
## [1] "found tokenizer!"

ISarcasm
  • Click Code to show model code
# Hyperparameters: vocabulary size, embedding width, padded sequence length
max_words = 1000
embedding_dim = 8
maxlen = 50


# Same binary setup on the ISarcasm dataset
sets <- retrieve_dataset_ISarcasm(binary = TRUE
                         )

train <- sets$train_set
training_labels <- sets$train_labels

validation <- sets$validation_set
validation_labels <- sets$validation_labels

test <- sets$test_set
test_labels <- sets$test_labels



# Tokenize and pad all three splits (tokenizer fit on training text only)
sequences <- generate_sequences(train,
                                validation,
                                test,
                                maxlen = maxlen,
                                max_words = max_words)
training_sequences <- sequences$train
validation_sequences <- sequences$validation
test_sequences <- sequences$test

# Same architecture as the other runs, for comparability
model <-  keras_model_sequential() %>%
  layer_embedding(input_dim = max_words,
                  output_dim = embedding_dim,
                  input_length = maxlen) %>%
  bidirectional(layer_lstm(units = 128, return_sequences = TRUE))%>%
  layer_lstm(units = 64, return_sequences = FALSE) %>%
  layer_flatten() %>%
  layer_dense(units = 1, 
              activation = "sigmoid") 

model %>% compile(
  optimizer = "rmsprop",
  loss = "binary_crossentropy",
  metrics = "accuracy"
)

# More epochs here: ISarcasm is a smaller dataset than the Twitter one
history <- model %>% fit(
  training_sequences,
  training_labels,
  epochs = 17,
  batch_size = 128,
  validation_data= list(validation_sequences,validation_labels)
)


# Held-out loss and accuracy on the test split
results <- model %>% evaluate(test_sequences,test_labels)
results
Predictions and visualizing accuracy
## [1] "found tokenizer!"

SVM

Random Forest

Decision Trees

Functions

# Shuffle a data frame and return either the leading `size` fraction of rows
# (train = TRUE) or the remaining rows (train = FALSE).
#
# data:  data frame to split
# size:  fraction of rows assigned to the training portion
# train: which side of the split to return
#
# NOTE(review): each call shuffles independently, so calling this twice (once
# for train, once for test) does not produce disjoint sets.
create_train_test <- function(data, size = 0.8, train = TRUE) {
  # Randomise row order before taking the cut
  shuffled <- data[sample(1:nrow(data)), ]
  cutoff <- size * nrow(shuffled)
  leading_rows <- 1: cutoff
  if (train) {
    shuffled[leading_rows, ]
  } else {
    shuffled[-leading_rows, ]
  }
}

Import

test <- read_csv("Datasets/Tweets_with_Sarcasm_and_Irony/test_without_hashtags.csv")
## New names:
## Rows: 8128 Columns: 3
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): tweets, class dbl (1): ...1
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
train <- read.csv("Datasets/Tweets_with_Sarcasm_and_Irony/train_without_hashtags.csv")

Preprocessing Data

# Keep only the columns the model needs
test <- select(test, class, tweets)
train <- select(train, class,tweets)

# Collapse to two classes: everything sarcastic-adjacent becomes "sarcasm".
# NOTE(review): this replaces matching values anywhere in the data frame, so
# a tweet whose entire text is exactly "figurative"/"irony" would also be
# rewritten -- confirm this edge case is acceptable.
#Run these two lines if you are only using sarcasm and regular in the model
train[train == "figurative"] <- "sarcasm"
train[train == "irony"] <- "sarcasm"
# Build a tm corpus from the raw tweet text
corpus <- Corpus(VectorSource(train$tweets))

corpus <- tm_map(corpus,PlainTextDocument)
## Warning in tm_map.SimpleCorpus(corpus, PlainTextDocument): transformation drops
## documents
corpus <- tm_map(corpus,tolower)
## Warning in tm_map.SimpleCorpus(corpus, tolower): transformation drops documents
corpus <- tm_map(corpus,removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation drops
## documents
corpus <- tm_map(corpus,removeWords,stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents
corpus <- tm_map(corpus,stemDocument)
## Warning in tm_map.SimpleCorpus(corpus, stemDocument): transformation drops
## documents
freq <- DocumentTermMatrix(corpus)
#Remove rare words
sparse <- removeSparseTerms(freq,.995)
#Create a matrix of the words
tSparse <- as.data.frame(as.matrix(sparse))
colnames(tSparse) = make.names(colnames(tSparse))
tSparse$class = train$class
#Split the data
trainSet <- create_train_test(tSparse,.8,train = T)
testSet <- create_train_test(tSparse,.8,train = F)
#Check the proportion of each class
prop.table(table(tSparse$class))
## 
##   regular   sarcasm 
## 0.2284174 0.7715826

Model

#Make the Model
#This is where you tweak the parameters to get the tree you want. This is also where the pruning is done.
# CART classification tree over the bag-of-words features; minsplit and cp
# control pre-pruning (cp = .00045 allows fairly deep trees).
fit <- rpart(class~., data = trainSet,
             method = "class",
             minsplit = 28,
             cp = .00045,
             maxsurrogate = 1)
#Shows the significance of the words it is using in the model
# NOTE(review): the split variables below include "ironi" and "sarcasm" --
# stemmed label hashtags surviving in the tweet text. That is label leakage:
# the model can read the class off the hashtag token.
printcp(fit)
## 
## Classification tree:
## rpart(formula = class ~ ., data = trainSet, method = "class", 
##     minsplit = 28, cp = 0.00045, maxsurrogate = 1)
## 
## Variables actually used in tree construction:
##  [1] cant    drug    get     ironi   just    late    like    love    money  
## [10] peopl   polit   run     sarcasm thank   that   
## 
## Root node error: 14970/65126 = 0.22986
## 
## n= 65126 
## 
## ##           CP nsplit rel error  xerror      xstd
## 1 0.01249165      0   1.00000 1.00000 0.0071725
## 2 0.00096192      1   0.98751 0.98751 0.0071409
## 3 0.00055667      6   0.98270 0.98283 0.0071289
## 4 0.00046760     13   0.97876 0.98243 0.0071279
## 5 0.00045000     15   0.97782 0.98069 0.0071234

Visualization

# Plot the fitted tree.
rpart.plot(fit)

Testing

#Making the prediction
prediction <- predict(fit, testSet, type = "class")
#prediction
#Testing the prediction against the training set
# (The comment above is misleading: this compares predictions against the
# held-out testSet labels, not the training set.)
table_mat <- table(testSet$class, prediction)
# Overall accuracy = correctly classified / total.
accuracy_Test <- sum(diag(table_mat)) / sum(table_mat)
accuracy_Test
## [1] 0.7765631
## [1] 0.7765631

XgBoost

Models

Tweets with Sarcasm with Hashtags(regular, sarcasm, figurative, irony)
library('xgboost')
## Warning: package 'xgboost' was built under R version 4.2.3
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
library('tm')
library('pacman')
## Warning: package 'pacman' was built under R version 4.2.3
library('tidyverse')
test <- read.csv("Datasets/Tweets_with_Sarcasm_and_Irony/test.csv")
train <- read.csv("Datasets/Tweets_with_Sarcasm_and_Irony/train.csv")
train <- train %>% filter(class!="")
create_train_test <- function(data_size, size = 0.8, train = TRUE) {
  # Return shuffled row indices for a train/test split of `data_size` rows.
  # The seed is fixed so the train = TRUE and train = FALSE calls slice the
  # SAME permutation into complementary index sets.
  #
  # Args:
  #   data_size: total number of rows to index.
  #   size:      fraction of rows assigned to the train partition.
  #   train:     TRUE -> train indices, FALSE -> the remaining indices.
  set.seed(123) # Set seed for reproducibility

  # Create shuffled indices
  shuffled_indices <- sample(1:data_size)

  # Calculate the number of rows for the train set
  train_rows <- round(size * data_size)

  if (train) {
    return(shuffled_indices[seq_len(train_rows)])
  }
  if (train_rows >= data_size) {
    # Fix: the original computed (data_size + 1):data_size here -- a
    # DESCENDING sequence -- which returned an NA index plus a duplicated
    # train index. An exhausted test partition is simply empty.
    return(integer(0))
  }
  shuffled_indices[(train_rows + 1):data_size]
}

#CSC Project
pacman::p_load(datasets,pacman, dplyr, GGally, ggplot2, ggthemes, ggvis,
               httr, lubridate, plotly, rio, rmarkdown, shiny,
               stringr, tidyverse, lessR, aplpack, readr, tm, SnowballC, rpart.plot)
## Installing package into 'C:/Users/tyele/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.2:
##   cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.2/PACKAGES'
## package 'ggvis' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\tyele\AppData\Local\Temp\RtmpIBm15e\downloaded_packages
## 
## ggvis installed
## Warning: package 'ggvis' was built under R version 4.2.3
## Installing package into 'C:/Users/tyele/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.2:
##   cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.2/PACKAGES'
## package 'plotly' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\tyele\AppData\Local\Temp\RtmpIBm15e\downloaded_packages
## 
## plotly installed
## Warning: package 'plotly' was built under R version 4.2.3
## Installing package into 'C:/Users/tyele/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## also installing the dependency 'htmltools'
## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.2:
##   cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.2/PACKAGES'
## package 'htmltools' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'htmltools'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\tyele\AppData\Local\R\win-library\4.2\00LOCK\htmltools\libs\x64\htmltools.dll
## to
## C:\Users\tyele\AppData\Local\R\win-library\4.2\htmltools\libs\x64\htmltools.dll:
## Permission denied
## Warning: restored 'htmltools'
## package 'shiny' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\tyele\AppData\Local\Temp\RtmpIBm15e\downloaded_packages
## 
## shiny installed
## Warning: package 'shiny' was built under R version 4.2.3
#class and tweets
#-----------------------------------------------------------------------------------------------------------------------------
#Preprocessing train
# tm pipeline: lowercase, strip punctuation/stopwords, stem, then a
# bag-of-words document-term matrix with rare terms (< 0.5% of docs) removed.
corpus <- Corpus(VectorSource(train$tweets))
corpus <- tm_map(corpus,PlainTextDocument)
corpus <- tm_map(corpus,tolower)
corpus <- tm_map(corpus,removePunctuation)
corpus <- tm_map(corpus,removeWords,stopwords("english"))
corpus <- tm_map(corpus,stemDocument)
freq <- DocumentTermMatrix(corpus)
sparse <- removeSparseTerms(freq,.995)
tSparse <- as.data.frame(as.matrix(sparse))
colnames(tSparse) = make.names(colnames(tSparse))
tSparse$class = train$class

#Prepare data for XGBoost
# Both calls are seeded inside create_train_test, so they partition the same
# permutation into complementary index sets.
train_indices <- create_train_test(nrow(tSparse), 0.8, train = TRUE)
test_indices <- create_train_test(nrow(tSparse), 0.8, train = FALSE)

# Encode class labels as 0-based integers (factor level order).
tSparse$class <- as.factor(tSparse$class)
tSparse$class <- as.numeric(tSparse$class) - 1

# Features are every column except the trailing class column.
trainSet <- xgb.DMatrix(data.matrix(tSparse[train_indices, -ncol(tSparse)]), label = tSparse$class[train_indices])
testSet <- xgb.DMatrix(data.matrix(tSparse[test_indices, -ncol(tSparse)]), label = tSparse$class[test_indices])

#Set up XGBoost parameters
# NOTE(review): this binary params list is dead code -- it is overwritten by
# the multi-class params list below before training.
params <- list(
  objective = "binary:logistic",
  eval_metric = "error"
)

#Train the XGBoost model
unique(tSparse$class)
## [1] 0 1 2 3
num_classes <- length(unique(tSparse$class))

params <- list(
  objective = "multi:softprob",
  eval_metric = "mlogloss",
  num_class = num_classes,
  eta = 0.3,
  max_depth = 6,
  min_child_weight = 1,
  subsample = 1,
  colsample_bytree = 1,
  gamma = 0
)

# NOTE(review): early stopping monitors the SAME split that is later used to
# report accuracy, so the result is optimistically biased.
fit <- xgb.train(params,
                 data = trainSet,
                 nrounds = 100,
                 watchlist = list(test = testSet),
                 early_stopping_rounds = 10,
                 maximize = FALSE,
                 print_every_n = 10
)
## [1]  test-mlogloss:0.853831 
## Will train until test_mlogloss hasn't improved in 10 rounds.
## 
## [11] test-mlogloss:0.033435 
## [21] test-mlogloss:0.001681 
## [31] test-mlogloss:0.000111 
## [41] test-mlogloss:0.000035 
## [51] test-mlogloss:0.000033 
## [61] test-mlogloss:0.000033 
## Stopping. Best iteration:
## [54] test-mlogloss:0.000033
#Predict the test set
# output_margin = TRUE returns raw (untransformed) scores rather than
# softprob probabilities; max.col over them still yields the same argmax
# class, encoded 0-based to match the labels.
prediction_prob <- predict(fit, testSet, output_margin = TRUE)
prediction <- matrix(prediction_prob, ncol = num_classes, byrow = TRUE)
prediction <- max.col(prediction) - 1

true_labels <- tSparse$class[test_indices]
# NOTE(review): perfect accuracy on this hashtag-retaining dataset --
# presumably label-revealing hashtag tokens (e.g. #sarcasm, #irony) remain
# among the features (cf. the rpart split variables "sarcasm"/"ironi"
# earlier); verify before trusting this number.
mean(true_labels == prediction)
## [1] 1
Confusion_Matrix <- function (Labels, Guesses) {
  # Build a 2x2 confusion matrix for BINARY (0/1) labels and predictions.
  # Layout (filled by row): [TN FP; FN TP].
  #
  # Fixes vs. the original:
  #  * The original encoded each (label, guess) pair as a binary string via
  #    strtoi(..., base = 2). Any label or guess outside {0, 1} produced NA,
  #    and the subsequent NA comparisons silently inflated EVERY count.
  #    Non-binary pairs are now dropped with an explicit warning.
  #  * Counting is vectorized in base R (sum over logical masks), removing
  #    the per-element purrr::map2 dependency.
  valid <- Labels %in% c(0, 1) & Guesses %in% c(0, 1)
  if (!all(valid)) {
    warning("Confusion_Matrix: dropping ", sum(!valid),
            " pair(s) with non-binary labels/guesses", call. = FALSE)
    Labels <- Labels[valid]
    Guesses <- Guesses[valid]
  }

  TN_Count <- sum(Labels == 0 & Guesses == 0)
  FP_Count <- sum(Labels == 0 & Guesses == 1)
  FN_Count <- sum(Labels == 1 & Guesses == 0)
  TP_Count <- sum(Labels == 1 & Guesses == 1)

  # Return confusion matrix
  matrix(c(TN_Count, FP_Count, FN_Count, TP_Count), nrow = 2, byrow = TRUE)
}

# NOTE(review): true_labels/prediction take values 0-3 here (four classes,
# see the unique() output above), but Confusion_Matrix() assumes binary 0/1
# input -- so this 2x2 matrix and the derived specificity/sensitivity are
# not meaningful for the 4-class run; confirm before citing these numbers.
confusion_matrix <- Confusion_Matrix(Labels = true_labels, Guesses = prediction)
print(confusion_matrix)
##       [,1]  [,2]
## [1,] 12086  7880
## [2,]  7880 12076
TN <- confusion_matrix[1, 1]
FP <- confusion_matrix[1, 2]
FN <- confusion_matrix[2, 1]
TP <- confusion_matrix[2, 2]

# Specificity = TN rate among actual negatives; sensitivity = TP rate among
# actual positives.
specificity <- TN / (TN + FP)
sensitivity <- TP / (TP + FN)

cat("Specificity:", specificity, "\n")
## Specificity: 0.6053291
cat("Sensitivity:", sensitivity, "\n")
## Sensitivity: 0.6051313
#-------------------------------------------------------------------------------
FP_Pie_Chart <- function(Labels, Guesses) {
  # Draw a TP/TN/FP/FN pie chart for binary predictions, annotated with
  # positive, negative, and overall accuracy.
  # Depends on Accuracy_Label_Table() (defined elsewhere in this document).
  require('cowplot')
  require('ggrepel')
  require('grid')
  a_table <- Accuracy_Label_Table(Labels = Labels,
                     Guesses = Guesses)
  # Positional indexing assumes the fixed row order TN, FP, FN, TP produced
  # by Accuracy_Label_Table(): row 1 = TN, 2 = FP, 3 = FN, 4 = TP.
  # NOTE(review): N_Acc = TN/(TN+FN) is the negative predictive value and
  # P_Acc = TP/(TP+FP) is precision -- the "Accuracy Negative/Positive"
  # labels are loose; confirm the intended metrics.
  N_Acc <- round(a_table[1,2] / (a_table[1,2] + a_table[3,2]), digits = 4)
  P_Acc <- round(a_table[4,2] / (a_table[4,2] + a_table[2,2]), digits = 4)
  Acc <- round((a_table[1,2] + a_table[4,2]) / (a_table[1,2] + a_table[3,2] + a_table[4,2] + a_table[2,2]), digits = 4)

  # Single stacked bar folded into a pie via polar coordinates; each slice is
  # labelled with its count.
  plt <- a_table %>%
    ggplot(aes(x = "", y = value, fill = group)) +
    geom_col() + 
    geom_label(aes(label = value),
               position = position_stack(vjust = 0.5),
               show.legend = FALSE) +
    coord_polar(theta = "y") +
    scale_fill_manual(values = c("#FFABAB", "#FFB092",
                                 "#b4d4fa", "#BFFCC6"),
                      guide = guide_legend(reverse = TRUE)) + 
    ggtitle("TP, TN, FP, FN Pie Chart") +
    theme_void()

  # Wrap in a cowplot canvas so text can be pinned in normalized coordinates.
  plt <- ggdraw(plt)

  # xmin == xmax / ymin == ymax pins each textGrob at a point near the
  # top-right corner; the three labels are stacked 0.05 apart vertically.
  plt <- plt +
    annotation_custom(grob = textGrob(paste0("Accuracy Positive: ",P_Acc)),  xmin = 1 - .2, xmax = 1 - .2, ymin = 1 - .025, ymax = 1- .025) +
    annotation_custom(grob = textGrob(paste0("Accuracy Negative: ",N_Acc)),  xmin = 1 - .2, xmax = 1 - .2, ymin = 1 - .025 - .05, ymax = 1- .025 - .05) +
    annotation_custom(grob = textGrob(paste0("Total Accuracy: ",Acc)),  xmin = 1 - .2, xmax = 1 - .2, ymin = 1 - .025 - .1, ymax = 1- .025 - .1)
  plt
}

#FP_Pie_Chart(Labels = true_labels, Guesses = prediction)
Tweets with Sarcasm with Hashtags(regular, sarcasm)
library('xgboost')
library('tm')
library('pacman')
library('tidyverse')
test <- read.csv("Datasets/Tweets_with_Sarcasm_and_Irony/test.csv")
train <- read.csv("Datasets/Tweets_with_Sarcasm_and_Irony/train.csv")
train <- train %>% filter(class!="")
train$class[train$class == "figurative"] = "sarcasm"
train$class[train$class == "irony"] = "sarcasm"
create_train_test <- function(data_size, size = 0.8, train = TRUE) {
  # Produce row indices for a train/test split of `data_size` rows. The seed
  # is fixed so the train = TRUE and train = FALSE calls slice the SAME
  # shuffled permutation into complementary pieces.
  set.seed(123)

  permutation <- sample(1:data_size)
  cutoff <- round(size * data_size)

  if (train) {
    permutation[1:cutoff]
  } else {
    permutation[(cutoff + 1):data_size]
  }
}

#CSC Project
#pacman::p_load(datasets,pacman, dplyr, GGally, ggplot2, ggthemes, ggvis,
#               httr, lubridate, plotly, rio, rmarkdown, shiny,
 #              stringr, tidyverse, lessR, aplpack, readr, tm, SnowballC, rpart.plot)
#class and tweets
#-----------------------------------------------------------------------------------------------------------------------------
#Preprocessing train
corpus <- Corpus(VectorSource(train$tweets))
corpus <- tm_map(corpus,PlainTextDocument)
corpus <- tm_map(corpus,tolower)
corpus <- tm_map(corpus,removePunctuation)
corpus <- tm_map(corpus,removeWords,stopwords("english"))
corpus <- tm_map(corpus,stemDocument)
freq <- DocumentTermMatrix(corpus)
sparse <- removeSparseTerms(freq,.995)
tSparse <- as.data.frame(as.matrix(sparse))
colnames(tSparse) = make.names(colnames(tSparse))
tSparse$class = train$class

#Prepare data for XGBoost
train_indices <- create_train_test(nrow(tSparse), 0.8, train = TRUE)
test_indices <- create_train_test(nrow(tSparse), 0.8, train = FALSE)

tSparse$class <- as.factor(tSparse$class)
tSparse$class <- as.numeric(tSparse$class) - 1

trainSet <- xgb.DMatrix(data.matrix(tSparse[train_indices, -ncol(tSparse)]), label = tSparse$class[train_indices])
testSet <- xgb.DMatrix(data.matrix(tSparse[test_indices, -ncol(tSparse)]), label = tSparse$class[test_indices])

#Set up XGBoost parameters
params <- list(
  objective = "binary:logistic",
  eval_metric = "error"
)

#Train the XGBoost model
unique(tSparse$class)
## [1] 1 0
num_classes <- length(unique(tSparse$class))

params <- list(
  objective = "multi:softprob",
  eval_metric = "mlogloss",
  num_class = num_classes,
  eta = 0.3,
  max_depth = 6,
  min_child_weight = 1,
  subsample = 1,
  colsample_bytree = 1,
  gamma = 0
)

fit <- xgb.train(params,
                 data = trainSet,
                 nrounds = 100,
                 watchlist = list(test = testSet),
                 early_stopping_rounds = 10,
                 maximize = FALSE,
                 print_every_n = 10
)
## [1]  test-mlogloss:0.437501 
## Will train until test_mlogloss hasn't improved in 10 rounds.
## 
## [11] test-mlogloss:0.016627 
## [21] test-mlogloss:0.000832 
## [31] test-mlogloss:0.000052 
## [41] test-mlogloss:0.000018 
## [51] test-mlogloss:0.000015 
## [61] test-mlogloss:0.000015 
## Stopping. Best iteration:
## [55] test-mlogloss:0.000015
#Predict the test set
prediction_prob <- predict(fit, testSet, output_margin = TRUE)
prediction <- matrix(prediction_prob, ncol = num_classes, byrow = TRUE)
prediction <- max.col(prediction) - 1

true_labels <- tSparse$class[test_indices]
mean(true_labels == prediction)
## [1] 1
Accuracy_Label_Table <- function (Labels, Guesses) {
  # Tally TN/FP/FN/TP for BINARY (0/1) labels vs. predictions and return a
  # long-format data.frame (group, value) ready for plotting.
  #
  # Fixes vs. the original:
  #  * The original classified each pair by parsing a binary string via
  #    strtoi(..., base = 2); any label/guess outside {0, 1} produced NA and
  #    the NA comparisons silently inflated every count.
  #  * Counting is vectorized in base R (sum over logical masks), removing
  #    the per-element purrr::map2 dependency.
  TN_Count <- sum(Labels == 0 & Guesses == 0) #Label = 0, Guess = 0
  FP_Count <- sum(Labels == 0 & Guesses == 1) #Label = 0, Guess = 1
  FN_Count <- sum(Labels == 1 & Guesses == 0) #Label = 1, Guess = 0
  TP_Count <- sum(Labels == 1 & Guesses == 1) #Label = 1, Guess = 1

  # Row order is load-bearing: FP_Pie_Chart() indexes this table by position
  # (row 1 = TN, 2 = FP, 3 = FN, 4 = TP).
  data.frame(
    group = c("True Negative (TN)",
              "False Positive (FP)",
              "False Negative (FN)",
              "True Positive (TP)"),
    value = c(TN_Count, FP_Count, FN_Count, TP_Count)
  )
}
#-------------------------------------------------------------------------------
FP_Pie_Chart <- function(Labels, Guesses) {
  # Draw a TP/TN/FP/FN pie chart for binary predictions, annotated with
  # positive, negative, and overall accuracy.
  # Depends on Accuracy_Label_Table() (defined elsewhere in this document).
  require('cowplot')
  require('ggrepel')
  require('grid')
  a_table <- Accuracy_Label_Table(Labels = Labels,
                     Guesses = Guesses)
  # Positional indexing assumes the fixed row order TN, FP, FN, TP produced
  # by Accuracy_Label_Table(): row 1 = TN, 2 = FP, 3 = FN, 4 = TP.
  # NOTE(review): N_Acc = TN/(TN+FN) is the negative predictive value and
  # P_Acc = TP/(TP+FP) is precision -- the "Accuracy Negative/Positive"
  # labels are loose; confirm the intended metrics.
  N_Acc <- round(a_table[1,2] / (a_table[1,2] + a_table[3,2]), digits = 4)
  P_Acc <- round(a_table[4,2] / (a_table[4,2] + a_table[2,2]), digits = 4)
  Acc <- round((a_table[1,2] + a_table[4,2]) / (a_table[1,2] + a_table[3,2] + a_table[4,2] + a_table[2,2]), digits = 4)

  # Single stacked bar folded into a pie via polar coordinates; each slice is
  # labelled with its count.
  plt <- a_table %>%
    ggplot(aes(x = "", y = value, fill = group)) +
    geom_col() + 
    geom_label(aes(label = value),
               position = position_stack(vjust = 0.5),
               show.legend = FALSE) +
    coord_polar(theta = "y") +
    scale_fill_manual(values = c("#FFABAB", "#FFB092",
                                 "#b4d4fa", "#BFFCC6"),
                      guide = guide_legend(reverse = TRUE)) + 
    ggtitle("TP, TN, FP, FN Pie Chart") +
    theme_void()

  # Wrap in a cowplot canvas so text can be pinned in normalized coordinates.
  plt <- ggdraw(plt)

  # xmin == xmax / ymin == ymax pins each textGrob at a point near the
  # top-right corner; the three labels are stacked 0.05 apart vertically.
  plt <- plt +
    annotation_custom(grob = textGrob(paste0("Accuracy Positive: ",P_Acc)),  xmin = 1 - .2, xmax = 1 - .2, ymin = 1 - .025, ymax = 1- .025) +
    annotation_custom(grob = textGrob(paste0("Accuracy Negative: ",N_Acc)),  xmin = 1 - .2, xmax = 1 - .2, ymin = 1 - .025 - .05, ymax = 1- .025 - .05) +
    annotation_custom(grob = textGrob(paste0("Total Accuracy: ",Acc)),  xmin = 1 - .2, xmax = 1 - .2, ymin = 1 - .025 - .1, ymax = 1- .025 - .1)
  plt
}

FP_Pie_Chart(Labels = true_labels, Guesses = prediction)
## Loading required package: ggrepel

ISarcasm dataset
library('xgboost')
library('tm')
library('pacman')
library('tidyverse')
test <- read.csv("Datasets/ISarcasm/test.csv")
train <- read.csv("Datasets/ISarcasm/train.csv")
train <- train %>% filter(class!="")
train$class[train$class == "figurative"] = "sarcasm"
train$class[train$class == "irony"] = "sarcasm"
create_train_test <- function(data_size, size = 0.8, train = TRUE) {
  # Produce row indices for a train/test split of `data_size` rows. The seed
  # is fixed so the train = TRUE and train = FALSE calls slice the SAME
  # shuffled permutation into complementary pieces.
  set.seed(123)

  permutation <- sample(1:data_size)
  cutoff <- round(size * data_size)

  if (train) {
    permutation[1:cutoff]
  } else {
    permutation[(cutoff + 1):data_size]
  }
}

#CSC Project
#pacman::p_load(datasets,pacman, dplyr, GGally, ggplot2, ggthemes, ggvis,
#               httr, lubridate, plotly, rio, rmarkdown, shiny,
#               stringr, tidyverse, lessR, aplpack, readr, tm, SnowballC, rpart.plot)
#class and tweets
#-----------------------------------------------------------------------------------------------------------------------------
#Preprocessing train
corpus <- Corpus(VectorSource(train$tweets))
corpus <- tm_map(corpus,PlainTextDocument)
corpus <- tm_map(corpus,tolower)
corpus <- tm_map(corpus,removePunctuation)
corpus <- tm_map(corpus,removeWords,stopwords("english"))
corpus <- tm_map(corpus,stemDocument)
freq <- DocumentTermMatrix(corpus)
sparse <- removeSparseTerms(freq,.995)
tSparse <- as.data.frame(as.matrix(sparse))
colnames(tSparse) = make.names(colnames(tSparse))
tSparse$class = train$class

#Prepare data for XGBoost
train_indices <- create_train_test(nrow(tSparse), 0.8, train = TRUE)
test_indices <- create_train_test(nrow(tSparse), 0.8, train = FALSE)

tSparse$class <- as.factor(tSparse$class)
tSparse$class <- as.numeric(tSparse$class) - 1

trainSet <- xgb.DMatrix(data.matrix(tSparse[train_indices, -ncol(tSparse)]), label = tSparse$class[train_indices])
testSet <- xgb.DMatrix(data.matrix(tSparse[test_indices, -ncol(tSparse)]), label = tSparse$class[test_indices])

#Set up XGBoost parameters
params <- list(
  objective = "binary:logistic",
  eval_metric = "error"
)

#Train the XGBoost model
unique(tSparse$class)
## [1] 1 0
num_classes <- length(unique(tSparse$class))

params <- list(
  objective = "multi:softprob",
  eval_metric = "mlogloss",
  num_class = num_classes,
  eta = 0.3,
  max_depth = 6,
  min_child_weight = 1,
  subsample = 1,
  colsample_bytree = 1,
  gamma = 0
)

fit <- xgb.train(params,
                 data = trainSet,
                 nrounds = 100,
                 watchlist = list(test = testSet),
                 early_stopping_rounds = 10,
                 maximize = FALSE,
                 print_every_n = 10
)
## [1]  test-mlogloss:0.437795 
## Will train until test_mlogloss hasn't improved in 10 rounds.
## 
## [11] test-mlogloss:0.016941 
## [21] test-mlogloss:0.001089 
## [31] test-mlogloss:0.000385 
## [41] test-mlogloss:0.000329 
## [51] test-mlogloss:0.000329 
## Stopping. Best iteration:
## [43] test-mlogloss:0.000329
#Predict the test set
#prediction_prob <- predict(fit, testSet, output_margin = TRUE)
#prediction <- matrix(prediction_prob, ncol = num_classes, byrow = TRUE)
#prediction <- max.col(prediction) - 1

# Predict the probabilities of each class for the test set
predicted_probs <- predict(fit, testSet, output_margin = TRUE)

# Reshape the vector of probabilities into a matrix
predicted_probs_matrix <- matrix(predicted_probs, ncol = num_classes, byrow = TRUE)

# Find the class with the maximum probability for each observation
predicted_class_indices <- apply(predicted_probs_matrix, 1, which.max) - 1


true_labels <- tSparse$class[test_indices]
mean(true_labels == predicted_class_indices)
## [1] 1
Accuracy_Label_Table <- function (Labels, Guesses) {
  # Tally TN/FP/FN/TP for BINARY (0/1) labels vs. predictions and return a
  # long-format data.frame (group, value) ready for plotting.
  #
  # Fixes vs. the original:
  #  * The original classified each pair by parsing a binary string via
  #    strtoi(..., base = 2); any label/guess outside {0, 1} produced NA and
  #    the NA comparisons silently inflated every count.
  #  * Counting is vectorized in base R (sum over logical masks), removing
  #    the per-element purrr::map2 dependency.
  TN_Count <- sum(Labels == 0 & Guesses == 0) #Label = 0, Guess = 0
  FP_Count <- sum(Labels == 0 & Guesses == 1) #Label = 0, Guess = 1
  FN_Count <- sum(Labels == 1 & Guesses == 0) #Label = 1, Guess = 0
  TP_Count <- sum(Labels == 1 & Guesses == 1) #Label = 1, Guess = 1

  # Row order is load-bearing: FP_Pie_Chart() indexes this table by position
  # (row 1 = TN, 2 = FP, 3 = FN, 4 = TP).
  data.frame(
    group = c("True Negative (TN)",
              "False Positive (FP)",
              "False Negative (FN)",
              "True Positive (TP)"),
    value = c(TN_Count, FP_Count, FN_Count, TP_Count)
  )
}
#-------------------------------------------------------------------------------
FP_Pie_Chart <- function(Labels, Guesses) {
  # Draw a TP/TN/FP/FN pie chart for binary predictions, annotated with
  # positive, negative, and overall accuracy.
  # Depends on Accuracy_Label_Table() (defined elsewhere in this document).
  require('cowplot')
  require('ggrepel')
  require('grid')
  a_table <- Accuracy_Label_Table(Labels = Labels,
                     Guesses = Guesses)
  # Positional indexing assumes the fixed row order TN, FP, FN, TP produced
  # by Accuracy_Label_Table(): row 1 = TN, 2 = FP, 3 = FN, 4 = TP.
  # NOTE(review): N_Acc = TN/(TN+FN) is the negative predictive value and
  # P_Acc = TP/(TP+FP) is precision -- the "Accuracy Negative/Positive"
  # labels are loose; confirm the intended metrics.
  N_Acc <- round(a_table[1,2] / (a_table[1,2] + a_table[3,2]), digits = 4)
  P_Acc <- round(a_table[4,2] / (a_table[4,2] + a_table[2,2]), digits = 4)
  Acc <- round((a_table[1,2] + a_table[4,2]) / (a_table[1,2] + a_table[3,2] + a_table[4,2] + a_table[2,2]), digits = 4)

  # Single stacked bar folded into a pie via polar coordinates; each slice is
  # labelled with its count.
  plt <- a_table %>%
    ggplot(aes(x = "", y = value, fill = group)) +
    geom_col() + 
    geom_label(aes(label = value),
               position = position_stack(vjust = 0.5),
               show.legend = FALSE) +
    coord_polar(theta = "y") +
    scale_fill_manual(values = c("#FFABAB", "#FFB092",
                                 "#b4d4fa", "#BFFCC6"),
                      guide = guide_legend(reverse = TRUE)) + 
    ggtitle("TP, TN, FP, FN Pie Chart") +
    theme_void()

  # Wrap in a cowplot canvas so text can be pinned in normalized coordinates.
  plt <- ggdraw(plt)

  # xmin == xmax / ymin == ymax pins each textGrob at a point near the
  # top-right corner; the three labels are stacked 0.05 apart vertically.
  plt <- plt +
    annotation_custom(grob = textGrob(paste0("Accuracy Positive: ",P_Acc)),  xmin = 1 - .2, xmax = 1 - .2, ymin = 1 - .025, ymax = 1- .025) +
    annotation_custom(grob = textGrob(paste0("Accuracy Negative: ",N_Acc)),  xmin = 1 - .2, xmax = 1 - .2, ymin = 1 - .025 - .05, ymax = 1- .025 - .05) +
    annotation_custom(grob = textGrob(paste0("Total Accuracy: ",Acc)),  xmin = 1 - .2, xmax = 1 - .2, ymin = 1 - .025 - .1, ymax = 1- .025 - .1)
  plt
}

FP_Pie_Chart(Labels = true_labels, Guesses = predicted_class_indices)

Tweets with Sarcasm without Hashtags(regular, sarcasm, figurative, irony)
library('xgboost')
library('tm')
library('pacman')
library('tidyverse')
test <- read.csv("Datasets/Tweets_with_Sarcasm_and_Irony/test_without_hashtags.csv")
train <- read.csv("Datasets/Tweets_with_Sarcasm_and_Irony/train_without_hashtags.csv")
train <- train %>% filter(class!="")
create_train_test <- function(data_size, size = 0.8, train = TRUE) {
  # Produce row indices for a train/test split of `data_size` rows. The seed
  # is fixed so the train = TRUE and train = FALSE calls slice the SAME
  # shuffled permutation into complementary pieces.
  set.seed(123)

  permutation <- sample(1:data_size)
  cutoff <- round(size * data_size)

  if (train) {
    permutation[1:cutoff]
  } else {
    permutation[(cutoff + 1):data_size]
  }
}

#CSC Project
#pacman::p_load(datasets,pacman, dplyr, GGally, ggplot2, ggthemes, ggvis,
#               httr, lubridate, plotly, rio, rmarkdown, shiny,
#               stringr, tidyverse, lessR, aplpack, readr, tm, SnowballC, rpart.plot)
#class and tweets
#-----------------------------------------------------------------------------------------------------------------------------
#Preprocessing train
corpus <- Corpus(VectorSource(train$tweets))
corpus <- tm_map(corpus,PlainTextDocument)
corpus <- tm_map(corpus,tolower)
corpus <- tm_map(corpus,removePunctuation)
corpus <- tm_map(corpus,removeWords,stopwords("english"))
corpus <- tm_map(corpus,stemDocument)
freq <- DocumentTermMatrix(corpus)
sparse <- removeSparseTerms(freq,.995)
tSparse <- as.data.frame(as.matrix(sparse))
colnames(tSparse) = make.names(colnames(tSparse))
tSparse$class = train$class

#Prepare data for XGBoost
train_indices <- create_train_test(nrow(tSparse), 0.8, train = TRUE)
test_indices <- create_train_test(nrow(tSparse), 0.8, train = FALSE)

tSparse$class <- as.factor(tSparse$class)
tSparse$class <- as.numeric(tSparse$class) - 1

trainSet <- xgb.DMatrix(data.matrix(tSparse[train_indices, -ncol(tSparse)]), label = tSparse$class[train_indices])
testSet <- xgb.DMatrix(data.matrix(tSparse[test_indices, -ncol(tSparse)]), label = tSparse$class[test_indices])

#Set up XGBoost parameters
params <- list(
  objective = "multi:softprob",
  eval_metric = "error"
)

#Train the XGBoost model
unique(tSparse$class)
## [1] 0 1 2 3
num_classes <- length(unique(tSparse$class))

params <- list(
  objective = "multi:softprob",#What Tye changed, original:binary:logistic
  eval_metric = "mlogloss",
  num_class = num_classes,
  eta = 0.3,
  max_depth = 6,
  min_child_weight = 1,
  subsample = 1,
  colsample_bytree = 1,
  gamma = 0
)

fit <- xgb.train(params,
                 data = trainSet,
                 nrounds = 100,
                 watchlist = list(test = testSet),
                 early_stopping_rounds = 10,
                 maximize = FALSE,
                 print_every_n = 10
)
## [1]  test-mlogloss:1.371232 
## Will train until test_mlogloss hasn't improved in 10 rounds.
## 
## [11] test-mlogloss:1.321458 
## [21] test-mlogloss:1.302363 
## [31] test-mlogloss:1.291269 
## [41] test-mlogloss:1.284058 
## [51] test-mlogloss:1.279335 
## [61] test-mlogloss:1.275780 
## [71] test-mlogloss:1.273490 
## [81] test-mlogloss:1.271704 
## [91] test-mlogloss:1.270742 
## [100]    test-mlogloss:1.270164
#Predict the test set
prediction_prob <- predict(fit, testSet, output_margin = TRUE)
prediction <- matrix(prediction_prob, ncol = num_classes, byrow = TRUE)
prediction <- max.col(prediction) - 1

true_labels <- tSparse$class[test_indices]
mean(true_labels == prediction)
## [1] 0.3952831
Confusion_Matrix <- function (Labels, Guesses) {
  # Build a 2x2 confusion matrix for BINARY (0/1) labels and predictions.
  # Layout (filled by row): [TN FP; FN TP].
  #
  # Fixes vs. the original:
  #  * The original encoded each (label, guess) pair as a binary string via
  #    strtoi(..., base = 2). Any label or guess outside {0, 1} produced NA,
  #    and the subsequent NA comparisons silently inflated EVERY count.
  #    Non-binary pairs are now dropped with an explicit warning.
  #  * Counting is vectorized in base R (sum over logical masks), removing
  #    the per-element purrr::map2 dependency.
  valid <- Labels %in% c(0, 1) & Guesses %in% c(0, 1)
  if (!all(valid)) {
    warning("Confusion_Matrix: dropping ", sum(!valid),
            " pair(s) with non-binary labels/guesses", call. = FALSE)
    Labels <- Labels[valid]
    Guesses <- Guesses[valid]
  }

  TN_Count <- sum(Labels == 0 & Guesses == 0)
  FP_Count <- sum(Labels == 0 & Guesses == 1)
  FN_Count <- sum(Labels == 1 & Guesses == 0)
  TP_Count <- sum(Labels == 1 & Guesses == 1)

  # Return confusion matrix
  matrix(c(TN_Count, FP_Count, FN_Count, TP_Count), nrow = 2, byrow = TRUE)
}

confusion_matrix <- Confusion_Matrix(Labels = true_labels, Guesses = prediction)
print(confusion_matrix)
##       [,1]  [,2]
## [1,] 12886 13696
## [2,] 13055 14001
TN <- confusion_matrix[1, 1]
FP <- confusion_matrix[1, 2]
FN <- confusion_matrix[2, 1]
TP <- confusion_matrix[2, 2]

specificity <- TN / (TN + FP)
sensitivity <- TP / (TP + FN)

cat("Specificity:", specificity, "\n")
## Specificity: 0.4847641
cat("Sensitivity:", sensitivity, "\n")
## Sensitivity: 0.5174823
#-------------------------------------------------------------------------------
FP_Pie_Chart <- function(Labels, Guesses) {
  # Draw a TP/TN/FP/FN pie chart for binary predictions, annotated with
  # positive, negative, and overall accuracy.
  # Depends on Accuracy_Label_Table() (defined elsewhere in this document).
  require('cowplot')
  require('ggrepel')
  require('grid')
  a_table <- Accuracy_Label_Table(Labels = Labels,
                     Guesses = Guesses)
  # Positional indexing assumes the fixed row order TN, FP, FN, TP produced
  # by Accuracy_Label_Table(): row 1 = TN, 2 = FP, 3 = FN, 4 = TP.
  # NOTE(review): N_Acc = TN/(TN+FN) is the negative predictive value and
  # P_Acc = TP/(TP+FP) is precision -- the "Accuracy Negative/Positive"
  # labels are loose; confirm the intended metrics.
  N_Acc <- round(a_table[1,2] / (a_table[1,2] + a_table[3,2]), digits = 4)
  P_Acc <- round(a_table[4,2] / (a_table[4,2] + a_table[2,2]), digits = 4)
  Acc <- round((a_table[1,2] + a_table[4,2]) / (a_table[1,2] + a_table[3,2] + a_table[4,2] + a_table[2,2]), digits = 4)

  # Single stacked bar folded into a pie via polar coordinates; each slice is
  # labelled with its count.
  plt <- a_table %>%
    ggplot(aes(x = "", y = value, fill = group)) +
    geom_col() + 
    geom_label(aes(label = value),
               position = position_stack(vjust = 0.5),
               show.legend = FALSE) +
    coord_polar(theta = "y") +
    scale_fill_manual(values = c("#FFABAB", "#FFB092",
                                 "#b4d4fa", "#BFFCC6"),
                      guide = guide_legend(reverse = TRUE)) + 
    ggtitle("TP, TN, FP, FN Pie Chart") +
    theme_void()

  # Wrap in a cowplot canvas so text can be pinned in normalized coordinates.
  plt <- ggdraw(plt)

  # xmin == xmax / ymin == ymax pins each textGrob at a point near the
  # top-right corner; the three labels are stacked 0.05 apart vertically.
  plt <- plt +
    annotation_custom(grob = textGrob(paste0("Accuracy Positive: ",P_Acc)),  xmin = 1 - .2, xmax = 1 - .2, ymin = 1 - .025, ymax = 1- .025) +
    annotation_custom(grob = textGrob(paste0("Accuracy Negative: ",N_Acc)),  xmin = 1 - .2, xmax = 1 - .2, ymin = 1 - .025 - .05, ymax = 1- .025 - .05) +
    annotation_custom(grob = textGrob(paste0("Total Accuracy: ",Acc)),  xmin = 1 - .2, xmax = 1 - .2, ymin = 1 - .025 - .1, ymax = 1- .025 - .1)
  plt
}

#FP_Pie_Chart(Labels = true_labels, Guesses = prediction)
Tweets with Sarcasm without Hashtags(regular, sarcasm)
library('xgboost')
library('tm')
library('pacman')
library('tidyverse')
test <- read.csv("Datasets/Tweets_with_Sarcasm_and_Irony/test_without_hashtags.csv")
train <- read.csv("Datasets/Tweets_with_Sarcasm_and_Irony/train_without_hashtags.csv")
even_out_observations <- function(data){
  # Balance the two classes: keep every 'regular' row plus an equal number of
  # non-regular rows relabelled as "sarcasm", then shuffle the result.
  #
  # Fixes vs. the original (which used `sarcasm[1:num_regular, ]`):
  #  * head() never pads with NA rows when there are fewer non-regular rows
  #    than regular ones.
  #  * When there are zero regular rows, head(x, 0) correctly returns no
  #    rows, whereas 1:0 == c(1, 0) wrongly selected row 1.
  #  * Base-R subsetting replaces the dplyr::filter dependency.
  regular <- data[data$class == "regular", , drop = FALSE]
  sarcasm <- data[data$class != "regular", , drop = FALSE]
  sarcasm$class <- "sarcasm"
  num_regular <- nrow(regular)
  sarcasm <- head(sarcasm, num_regular)
  balanced <- rbind(regular, sarcasm)
  # Shuffle so the classes are interleaved.
  balanced[sample(seq_len(nrow(balanced))), ]
}

train <- train %>% filter(class!="")
train$class[train$class == "figurative"] = "sarcasm"
train$class[train$class == "irony"] = "sarcasm"
create_train_test <- function(data_size, size = 0.8, train = TRUE) {
  # Produce row indices for a train/test split of `data_size` rows. The seed
  # is fixed so the train = TRUE and train = FALSE calls slice the SAME
  # shuffled permutation into complementary pieces.
  set.seed(123)

  permutation <- sample(1:data_size)
  cutoff <- round(size * data_size)

  if (train) {
    permutation[1:cutoff]
  } else {
    permutation[(cutoff + 1):data_size]
  }
}

#CSC Project
#pacman::p_load(datasets,pacman, dplyr, GGally, ggplot2, ggthemes, ggvis,
 #              httr, lubridate, plotly, rio, rmarkdown, shiny,
#               stringr, tidyverse, lessR, aplpack, readr, tm, SnowballC, rpart.plot)
#class and tweets
#-----------------------------------------------------------------------------------------------------------------------------
#Preprocessing train
# Build a bag-of-words representation of the tweets with the tm pipeline.
corpus <- Corpus(VectorSource(train$tweets))
corpus <- tm_map(corpus,PlainTextDocument)
# NOTE(review): newer tm versions expect content_transformer(tolower) here;
# a bare tolower can break corpus structure — confirm against the tm version used.
corpus <- tm_map(corpus,tolower)
corpus <- tm_map(corpus,removePunctuation)
corpus <- tm_map(corpus,removeWords,stopwords("english"))
corpus <- tm_map(corpus,stemDocument)
freq <- DocumentTermMatrix(corpus)
# Drop very rare terms: keep terms with sparsity <= 99.5%.
sparse <- removeSparseTerms(freq,.995)
tSparse <- as.data.frame(as.matrix(sparse))
# Force syntactically valid column names (terms may start with digits etc.).
colnames(tSparse) = make.names(colnames(tSparse))
tSparse$class = train$class

#Prepare data for XGBoost
# Seeded 80/20 row split over the document-term matrix.
train_indices <- create_train_test(nrow(tSparse), 0.8, train = TRUE)
test_indices <- create_train_test(nrow(tSparse), 0.8, train = FALSE)

# NOTE(review): these rebalance the raw `train`/`test` data frames AFTER
# tSparse was already built from `train`, so they have no effect on the
# matrices fed to XGBoost below — likely dead or misplaced code.
train = even_out_observations(train)
test = even_out_observations(test)

# Encode the class labels as 0/1 integers (factor levels minus one).
tSparse$class <- as.factor(tSparse$class)
tSparse$class <- as.numeric(tSparse$class) - 1

# DMatrix inputs: every term column is a feature; the last column (class)
# is stripped from the features and supplied as the label.
trainSet <- xgb.DMatrix(data.matrix(tSparse[train_indices, -ncol(tSparse)]), label = tSparse$class[train_indices])
testSet <- xgb.DMatrix(data.matrix(tSparse[test_indices, -ncol(tSparse)]), label = tSparse$class[test_indices])

#Set up XGBoost parameters
# NOTE(review): this binary-objective list is immediately overwritten by the
# multiclass list below and is therefore dead code.
params <- list(
  objective = "binary:logistic",
  eval_metric = "error"
)

#Train the XGBoost model
unique(tSparse$class)
## [1] 1 0
# Two classes after the 0/1 encoding above.
num_classes <- length(unique(tSparse$class))

# Multiclass soft-probability objective; with num_class = 2 this still
# solves the binary task but returns one probability column per class.
params <- list(
  objective = "multi:softprob",
  eval_metric = "mlogloss",
  num_class = num_classes,
  eta = 0.3,              # learning rate
  max_depth = 6,          # maximum tree depth
  min_child_weight = 1,
  subsample = 1,          # use all rows per tree
  colsample_bytree = 1,   # use all columns per tree
  gamma = 0               # no minimum loss reduction required to split
)

# Fit with early stopping against the held-out split (mlogloss minimized).
fit <- xgb.train(params,
                 data = trainSet,
                 nrounds = 100,
                 watchlist = list(test = testSet),
                 early_stopping_rounds = 10,
                 maximize = FALSE,
                 print_every_n = 10
)
## [1]  test-mlogloss:0.612246 
## Will train until test_mlogloss hasn't improved in 10 rounds.
## 
## [11] test-mlogloss:0.503314 
## [21] test-mlogloss:0.487499 
## [31] test-mlogloss:0.477564 
## [41] test-mlogloss:0.471270 
## [51] test-mlogloss:0.466316 
## [61] test-mlogloss:0.462801 
## [71] test-mlogloss:0.460536 
## [81] test-mlogloss:0.458621 
## [91] test-mlogloss:0.457421 
## [100]    test-mlogloss:0.456465
#Predict the test set
# output_margin = TRUE yields raw (pre-softmax) scores; since softmax is
# monotone, the row-wise argmax below picks the same class as probabilities.
prediction_prob <- predict(fit, testSet, output_margin = TRUE)
# Reshape the flattened predictions: one row per document, one column per class.
prediction <- matrix(prediction_prob, ncol = num_classes, byrow = TRUE)
# Column index of the max score, shifted to the 0-based label encoding.
prediction <- max.col(prediction) - 1

true_labels <- tSparse$class[test_indices]
# Overall accuracy on the held-out 20% split.
mean(true_labels == prediction)
## [1] 0.779941
Accuracy_Label_Table <- function (Labels, Guesses) {
  # Tally the four confusion-matrix cells for binary 0/1 labels and guesses.
  #
  # Labels:  true 0/1 labels.
  # Guesses: predicted 0/1 labels, same length as Labels.
  # Returns a data.frame with one row per cell (TN, FP, FN, TP) and its count.

  # Map a single (label, guess) pair to its cell name; 2*label + guess
  # enumerates the four combinations 00, 01, 10, 11 in order.
  classify_pair <- function(label, guess) {
    cells <- c("TN", # label = 0, guess = 0
               "FP", # label = 0, guess = 1
               "FN", # label = 1, guess = 0
               "TP") # label = 1, guess = 1
    cells[label * 2 + guess + 1]
  }

  outcomes <- mapply(classify_pair, Labels, Guesses)

  # Count each cell's occurrences in fixed TN/FP/FN/TP order.
  cell_order <- c("TN", "FP", "FN", "TP")
  counts <- vapply(cell_order,
                   function(cell) sum(outcomes == cell),
                   integer(1))

  data.frame(
    group = c("True Negative (TN)",  # label = 0, guess = 0
              "False Positive (FP)", # label = 0, guess = 1
              "False Negative (FN)", # label = 1, guess = 0
              "True Positive (TP)"   # label = 1, guess = 1
              ),
    value = unname(counts)
  )
}
#-------------------------------------------------------------------------------
FP_Pie_Chart <- function(Labels, Guesses) {
  # Draw a pie chart of the confusion-matrix cells (TN/FP/FN/TP) and overlay
  # positive, negative, and total accuracy figures on the canvas.
  #
  # Labels:  true 0/1 labels.
  # Guesses: predicted 0/1 labels, same length as Labels.
  # Returns a cowplot drawing object (print it to render).
  require('cowplot')
  require('ggrepel')
  require('grid')

  counts <- Accuracy_Label_Table(Labels = Labels,
                                 Guesses = Guesses)
  tn <- counts[1, 2]
  fp <- counts[2, 2]
  fn <- counts[3, 2]
  tp <- counts[4, 2]

  # Precision of the negative and positive calls, plus overall accuracy.
  N_Acc <- round(tn / (tn + fn), digits = 4)
  P_Acc <- round(tp / (tp + fp), digits = 4)
  Acc <- round((tn + tp) / (tn + fn + tp + fp), digits = 4)

  # Single stacked column bent into a pie via polar coordinates.
  pie <- ggplot(counts, aes(x = "", y = value, fill = group)) +
    geom_col() +
    geom_label(aes(label = value),
               position = position_stack(vjust = 0.5),
               show.legend = FALSE) +
    coord_polar(theta = "y") +
    scale_fill_manual(values = c("#FFABAB", "#FFB092",
                                 "#b4d4fa", "#BFFCC6"),
                      guide = guide_legend(reverse = TRUE)) +
    ggtitle("TP, TN, FP, FN Pie Chart") +
    theme_void()

  # Overlay the accuracy text near the top-right corner of the canvas.
  x_pos <- 0.8
  y_top <- 0.975
  ggdraw(pie) +
    annotation_custom(grob = textGrob(paste0("Accuracy Positive: ", P_Acc)),
                      xmin = x_pos, xmax = x_pos,
                      ymin = y_top, ymax = y_top) +
    annotation_custom(grob = textGrob(paste0("Accuracy Negative: ", N_Acc)),
                      xmin = x_pos, xmax = x_pos,
                      ymin = y_top - 0.05, ymax = y_top - 0.05) +
    annotation_custom(grob = textGrob(paste0("Total Accuracy: ", Acc)),
                      xmin = x_pos, xmax = x_pos,
                      ymin = y_top - 0.1, ymax = y_top - 0.1)
}

# Render the confusion pie chart for the XGBoost predictions on the test split.
FP_Pie_Chart(Labels = true_labels, Guesses = prediction)